In [1]:
import warnings
# Suppress specific FutureWarning
warnings.simplefilter("ignore", category=FutureWarning)
In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Load the dataset (Make sure to replace the path with your local dataset path)
df = pd.read_csv("C:/Users/sarva/Desktop/kc_house_data.csv")
df
Out[2]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21608 | 263000018 | 20140521T000000 | 360000.0 | 3 | 2.50 | 1530 | 1131 | 3.0 | 0 | 0 | ... | 8 | 1530 | 0 | 2009 | 0 | 98103 | 47.6993 | -122.346 | 1530 | 1509 |
| 21609 | 6600060120 | 20150223T000000 | 400000.0 | 4 | 2.50 | 2310 | 5813 | 2.0 | 0 | 0 | ... | 8 | 2310 | 0 | 2014 | 0 | 98146 | 47.5107 | -122.362 | 1830 | 7200 |
| 21610 | 1523300141 | 20140623T000000 | 402101.0 | 2 | 0.75 | 1020 | 1350 | 2.0 | 0 | 0 | ... | 7 | 1020 | 0 | 2009 | 0 | 98144 | 47.5944 | -122.299 | 1020 | 2007 |
| 21611 | 291310100 | 20150116T000000 | 400000.0 | 3 | 2.50 | 1600 | 2388 | 2.0 | 0 | 0 | ... | 8 | 1600 | 0 | 2004 | 0 | 98027 | 47.5345 | -122.069 | 1410 | 1287 |
| 21612 | 1523300157 | 20141015T000000 | 325000.0 | 2 | 0.75 | 1020 | 1076 | 2.0 | 0 | 0 | ... | 7 | 1020 | 0 | 2008 | 0 | 98144 | 47.5941 | -122.299 | 1020 | 1357 |
21613 rows × 21 columns
In [3]:
df.head()
Out[3]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
In [4]:
df.tail()
Out[4]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21608 | 263000018 | 20140521T000000 | 360000.0 | 3 | 2.50 | 1530 | 1131 | 3.0 | 0 | 0 | ... | 8 | 1530 | 0 | 2009 | 0 | 98103 | 47.6993 | -122.346 | 1530 | 1509 |
| 21609 | 6600060120 | 20150223T000000 | 400000.0 | 4 | 2.50 | 2310 | 5813 | 2.0 | 0 | 0 | ... | 8 | 2310 | 0 | 2014 | 0 | 98146 | 47.5107 | -122.362 | 1830 | 7200 |
| 21610 | 1523300141 | 20140623T000000 | 402101.0 | 2 | 0.75 | 1020 | 1350 | 2.0 | 0 | 0 | ... | 7 | 1020 | 0 | 2009 | 0 | 98144 | 47.5944 | -122.299 | 1020 | 2007 |
| 21611 | 291310100 | 20150116T000000 | 400000.0 | 3 | 2.50 | 1600 | 2388 | 2.0 | 0 | 0 | ... | 8 | 1600 | 0 | 2004 | 0 | 98027 | 47.5345 | -122.069 | 1410 | 1287 |
| 21612 | 1523300157 | 20141015T000000 | 325000.0 | 2 | 0.75 | 1020 | 1076 | 2.0 | 0 | 0 | ... | 7 | 1020 | 0 | 2008 | 0 | 98144 | 47.5941 | -122.299 | 1020 | 1357 |
5 rows × 21 columns
In [5]:
df.columns
Out[5]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15'],
dtype='object')
In [6]:
# Check for null values
df.isnull().sum()
Out[6]:
id 0 date 0 price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 sqft_living15 0 sqft_lot15 0 dtype: int64
In [7]:
# Check for duplicate rows
df.duplicated().sum()
Out[7]:
0
In [8]:
# Check for any missing values in the entire DataFrame
if df.isnull().values.any():
print("Missing values found in the dataset.")
else:
print("No missing values found.")
No missing values found.
In [9]:
# Handle missing values (if any)
df.dropna(inplace=True)
In [10]:
# Find duplicate rows in the entire DataFrame
duplicate_rows = df[df.duplicated()].sum()
# Print the duplicate rows
print(duplicate_rows)
id 0 date 0 price 0.0 bedrooms 0 bathrooms 0.0 sqft_living 0 sqft_lot 0 floors 0.0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0.0 long 0.0 sqft_living15 0 sqft_lot15 0 dtype: object
In [11]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(df, column):
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
# Calculate IQR (Interquartile Range)
IQR = Q3 - Q1
# Define the lower and upper bounds for outliers
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR
return lower_limit, upper_limit
In [12]:
# Function to apply Winsorization (outlier capping)
def apply_winsorization(df, column, lower_limit, upper_limit):
# Cap the values outside the lower and upper limits
df[column] = np.where(df[column] < lower_limit, lower_limit, df[column])
df[column] = np.where(df[column] > upper_limit, upper_limit, df[column])
return df
In [13]:
# List of columns to check for outliers
columns_to_check = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built']
# For each column in columns_to_check, detect outliers and print summary before and after Winsorization
for col in columns_to_check:
# Calculate the lower and upper limits for outliers
lower_limit, upper_limit = detect_outliers_iqr(df, col)
# Print summary of the original column
print(f"Column: {col}")
print("Before Winsorization (sample):")
print(df[col].describe())
# Apply Winsorization (Capping outliers)
df_winsorized = apply_winsorization(df.copy(), col, lower_limit, upper_limit)
# Print summary of the capped column
print("After Winsorization (sample):")
print(df_winsorized[col].describe())
print("-" * 40)
Column: price Before Winsorization (sample): count 2.161300e+04 mean 5.400881e+05 std 3.671272e+05 min 7.500000e+04 25% 3.219500e+05 50% 4.500000e+05 75% 6.450000e+05 max 7.700000e+06 Name: price, dtype: float64 After Winsorization (sample): count 2.161300e+04 mean 5.115873e+05 std 2.500026e+05 min 7.500000e+04 25% 3.219500e+05 50% 4.500000e+05 75% 6.450000e+05 max 1.129575e+06 Name: price, dtype: float64 ---------------------------------------- Column: sqft_living Before Winsorization (sample): count 21613.000000 mean 2079.899736 std 918.440897 min 290.000000 25% 1427.000000 50% 1910.000000 75% 2550.000000 max 13540.000000 Name: sqft_living, dtype: float64 After Winsorization (sample): count 21613.000000 mean 2058.078564 std 839.307806 min 290.000000 25% 1427.000000 50% 1910.000000 75% 2550.000000 max 4234.500000 Name: sqft_living, dtype: float64 ---------------------------------------- Column: sqft_lot Before Winsorization (sample): count 2.161300e+04 mean 1.510697e+04 std 4.142051e+04 min 5.200000e+02 25% 5.040000e+03 50% 7.618000e+03 75% 1.068800e+04 max 1.651359e+06 Name: sqft_lot, dtype: float64 After Winsorization (sample): count 21613.000000 mean 8705.224448 std 5046.482073 min 520.000000 25% 5040.000000 50% 7618.000000 75% 10688.000000 max 19160.000000 Name: sqft_lot, dtype: float64 ---------------------------------------- Column: sqft_above Before Winsorization (sample): count 21613.000000 mean 1788.390691 std 828.090978 min 290.000000 25% 1190.000000 50% 1560.000000 75% 2210.000000 max 9410.000000 Name: sqft_above, dtype: float64 After Winsorization (sample): count 21613.000000 mean 1769.563041 std 764.029323 min 290.000000 25% 1190.000000 50% 1560.000000 75% 2210.000000 max 3740.000000 Name: sqft_above, dtype: float64 ---------------------------------------- Column: sqft_basement Before Winsorization (sample): count 21613.000000 mean 291.509045 std 442.575043 min 0.000000 25% 0.000000 50% 0.000000 75% 560.000000 max 4820.000000 Name: sqft_basement, dtype: float64 After Winsorization (sample): count 21613.000000 mean 284.050155 std 417.064559 min 0.000000 25% 0.000000 50% 0.000000 75% 560.000000 max 1400.000000 Name: sqft_basement, dtype: float64 ---------------------------------------- Column: yr_built Before Winsorization (sample): count 21613.000000 mean 1971.005136 std 29.373411 min 1900.000000 25% 1951.000000 50% 1975.000000 75% 1997.000000 max 2015.000000 Name: yr_built, dtype: float64 After Winsorization (sample): count 21613.000000 mean 1971.005136 std 29.373411 min 1900.000000 25% 1951.000000 50% 1975.000000 75% 1997.000000 max 2015.000000 Name: yr_built, dtype: float64 ----------------------------------------
In [14]:
# Function to plot boxplots before and after outlier treatment (individually)
def plot_individual_boxplots(df, columns):
for col in columns:
# Calculate the lower and upper limits for outliers
lower_limit, upper_limit = detect_outliers_iqr(df, col)
# Plot Before Winsorization (Original Data)
plt.figure(figsize=(6, 4))
sns.boxplot(x=df[col])
plt.title(f'{col} Before Winsorization')
plt.show()
# Apply Winsorization (Capping outliers)
df_winsorized = apply_winsorization(df.copy(), col, lower_limit, upper_limit)
# Plot After Winsorization (Winsorized Data)
plt.figure(figsize=(6, 4))
sns.boxplot(x=df_winsorized[col])
plt.title(f'{col} After Winsorization')
plt.show()
# Plot all columns before and after outlier treatment individually
plot_individual_boxplots(df, columns_to_check)
In [15]:
#Identify numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
# Optionally, if you need to convert dates to numeric, you can do this here
# For example, if there's a 'date' column, we can convert it to datetime and then to the number of days or years
# Example for a 'date' column: df['date'] = pd.to_datetime(df['date'], errors='coerce')
# df['date'] = (df['date'] - df['date'].min()) # Convert to the number of days since the earliest date
# Create correlation matrix for only numeric columns
correlation_matrix = df[numeric_cols].corr()
# Plot correlation matrix heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
In [16]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
# Load dataset
df = pd.read_csv('kc_house_data.csv')
# Optional: Label Encoding categorical features if necessary
# For example, if 'waterfront' is a string, convert it to numerical
encoder = LabelEncoder()
df['waterfront'] = encoder.fit_transform(df['waterfront'])
In [17]:
fig = px.box(df, x="zipcode", y="price", title="1. Price Variation Across Different Neighborhoods",
color="zipcode",
labels={"zipcode": "Neighborhood (Zipcode)", "price": "Price"})
fig.update_layout(showlegend=False)
fig.show()
Analysis:¶
• The boxplot displays the median, IQR, and outliers for house prices by neighborhood.
• Larger boxes indicate more price variation, while whiskers show the price range.
• Outliers point to neighborhoods with extreme price values.
Insights:¶
• High-price neighborhoods show large IQRs and outliers, while affordable areas have smaller spreads.
2. How does the number of bedrooms affect house prices?¶
In [18]:
fig = px.scatter(
df, x="bedrooms", y="price", title="2. Price vs. Number of Bedrooms", trendline="ols", color="bedrooms"
)
# Customize layout and update title properties
fig.update_layout(
title={
"text": "2. Price vs. Number of Bedrooms", # Set the title text
"x": 0.5, # Center the title horizontally
"yanchor": "top", # Align the title to the top
"font": dict(size=24, family="Arial", color="darkblue"), # Set font size, family, and color
},
xaxis_title="Number of Bedrooms",
yaxis_title="Price",
)
# Update marker properties
fig.update_traces(marker=dict(size=12, opacity=0.6))
fig.show()
Analysis:¶
• The scatter plot with a trendline reveals the relationship between the number of bedrooms and price.
• The trendline (OLS) indicates a positive correlation, meaning that as the number of bedrooms increases, so does the price.
• Coloring by number of bedrooms helps highlight the variation in prices across different bedroom counts.
Insights:¶
• More bedrooms generally lead to higher house prices.
• Outliers and clusters can reveal specific patterns or neighborhoods where the price increase isn’t as steep.
3. What is the distribution of house prices in the dataset?¶
In [19]:
fig = px.histogram(df, x="price", nbins=50,
marginal="box", # Adding marginal box plot
color_discrete_sequence=["#636EFA"]) # Use one color for bins
fig.update_layout( title = ("3. Price Distribution vs Frequency"),
xaxis_title="Price ($)",
yaxis_title="Frequency",
title_x=0.5, # Center the title
title_font=dict(size=24, family="Arial", color="darkblue"), # Match font color and size
xaxis=dict(showgrid=True, gridcolor='lightgray', zeroline=False),
yaxis=dict(showgrid=True, gridcolor='lightgray', zeroline=False),
plot_bgcolor='white', # White background for a clean presentation
paper_bgcolor='white',
font=dict(family="Arial", size=14, color="DarkSlateGrey"),
)
# Show the updated plot
fig.show()
Analysis:¶
• The histogram provides a clear view of house price distribution, with marginal box plots for additional insights into the spread and central tendency.
• The price distribution shows a skewed distribution, indicating most houses fall within a certain price range with a few high-priced outliers.
• The use of the marginal box plot provides information on the median price, interquartile range, and presence of outliers.
Insights:¶
• Most house prices are concentrated around a mid-range value, but the presence of outliers (extremely high-priced homes) skews the distribution.
• The box plot gives insights into how widely prices are dispersed and where the majority of prices lie.
4. What are the relationships between key features in the dataset and how do they correlate with house prices?¶
In [20]:
# Set style for the plot
sns.set(style="whitegrid")
# Define the features (columns) you want to use in the pairplot
features = ['sqft_living', 'bedrooms', 'bathrooms', 'price', 'sqft_lot']
# Creating the pairplot
sns.pairplot(df[features],
hue='price',
palette='viridis', # Use 'viridis' for a smooth, color-blind friendly palette
plot_kws={'alpha': 0.7, 's': 70, 'edgecolor': 'w'}, # More transparency and styling
diag_kind='kde', # Use KDE for the diagonal histograms to show distribution
height=3) # Control the size of the plot
# Title and labels
plt.suptitle("4. Exploring Relationships Between Key Features", color="darkblue", fontsize=24)
plt.subplots_adjust(top=0.95) # Adjust title positioning to avoid overlap with plots
# Show the plot
plt.show()
Analysis:¶
• The pairplot visualizes pairwise relationships between multiple variables, with a smooth KDE (Kernel Density Estimate) on the diagonals to show the distribution of each feature.
• The hue based on price allows us to color-code the data points based on house price, revealing how each feature interacts with price.
Insights:¶
• Some features like square footage and number of bedrooms show a stronger relationship with price, while others may be less correlated.
• The KDE diagonal plots provide insights into the distribution of key variables and any skewness or trends within them.
5. How does house condition affect the price distribution across different house conditions?¶
In [21]:
plt.figure(figsize=(12, 6))
df['condition'] = df['condition'].astype('category')
# Creating a violin plot for house condition vs price with 'hue' set to 'condition'
sns.violinplot(data=df, x='condition', y='price', hue='condition', palette='Set2', legend=False)
# Title and labels
plt.title('5. Price Distribution Across Different House Conditions',color = "darkblue", fontsize=18)
plt.xlabel('House Condition', fontsize=14)
plt.ylabel('Price ($)', fontsize=14)
# Improve x-axis labels (no rotation needed)
plt.xticks(fontsize=12)
# Show the plot
plt.show()
Analysis:¶
• The violin plot provides a detailed view of the price distribution across various house conditions.
• The hue based on condition distinguishes between the conditions (e.g., good, fair, poor), allowing for easy comparison of how each condition affects price.
• The Set2 palette is used for clear visual distinction between different conditions, while violin plot shapes show the distribution and density of prices.
Insights:¶
• Houses with better conditions tend to have a higher price distribution, while those in poorer condition may have a wider range of prices but lower median values.
• The plot's distribution and density provide deeper insights into variability within each house condition.
6. How does the size (square footage) of a house affect its price?¶
In [22]:
import plotly.express as px
# Scatter plot with a trendline for square footage vs price
fig = px.scatter(df, x='sqft_living', y='price',
trendline="ols", # Adding a trendline to show correlation
title='6. Price vs Square Footage of House',
labels={'sqft_living': 'Square Footage (sqft)', 'price': 'Price ($)'},
color='sqft_living', # Color points by sqft_living
color_continuous_scale='Viridis', # Color scale for better visual appeal
template='plotly') # Use default light template for white background
# Update layout for cleaner presentation
fig.update_layout(
xaxis_title="Square Footage (sqft)",
yaxis_title="Price ($)",
plot_bgcolor='white', # Set white background for the plot area
title_x=0.5, # Center the title
title_font=dict(size=20, color='darkblue'),
font=dict(size=14, color='black'),
xaxis=dict(showgrid=True, gridcolor='lightgray'),
yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig.show()
Analysis:¶
• This scatter plot helps to visualize how larger homes tend to have higher prices.
• The trendline provides a clear indication of the positive correlation between square footage and price.
Insights:¶
• Positive Correlation: There’s a clear positive relationship between square footage and price, meaning larger homes are typically more expensive.
• Outliers: There might be some outliers where small homes are priced higher or large homes are priced lower, possibly due to factors like location, condition, or renovations.
• Data Distribution: The color gradient shows a range of square footage values, making it easier to see how larger homes are distributed across the price range.
7. How does the year of construction affect house prices?¶
In [23]:
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x="yr_built", y="price", hue="yr_built", palette="viridis", edgecolor='w', s=100)
plt.title("7. Price Distribution by Year of Construction", color = 'darkblue', fontsize=16)
plt.xlabel("Year Built", fontsize=12)
plt.ylabel("Price ($)", fontsize=12)
plt.legend(title='Year Built', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
Analysis:¶
• Scatter plot shows the relationship between year built and price.
• Color gradient helps differentiate houses by their construction year.
• Modern homes tend to have higher prices due to newer amenities, while older homes are generally priced lower unless renovated.
Insights:¶
• Newer homes generally fetch higher prices.
• Older homes may be priced lower, with some exceptions (luxury or renovated properties).
8. How does the average house price vary across different zip codes and with respect to the number of bedrooms?¶
In [24]:
import plotly.express as px
# Calculate average price per zipcode and per number of bedrooms
avg_price_data = df.groupby(['zipcode', 'bedrooms'], as_index=False)['price'].mean()
# Create a bar plot with price comparison across zipcodes and number of bedrooms
fig = px.bar(avg_price_data,
x="zipcode",
y="price",
color="bedrooms",
title="8. Average Price by Zipcode and Number of Bedrooms",
labels={"zipcode": "Neighborhood (Zipcode)", "price": "Average Price ($)", "bedrooms": "Number of Bedrooms"},
color_continuous_scale='Viridis', # Color scale for bedrooms
barmode='group' # Group bars by number of bedrooms
)
fig.update_layout(
xaxis_title="Zipcode",
yaxis_title="Average Price ($)",
title_x=0.5, # Center the title
title_font=dict(size=20, color='darkblue'),
xaxis=dict(showgrid=True, gridcolor='lightgray', tickangle=0),
yaxis=dict(showgrid=True, gridcolor='lightgray'),
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family="Arial", size=14, color="black"),
showlegend=True
)
fig.show()
Analysis:¶
• The bar plot shows how the average price of homes varies by zip code and the number of bedrooms.
• Different colors in the bars represent various bedroom counts, highlighting their impact on home prices across zip codes.
• It helps identify areas where homes with more bedrooms have significantly higher prices.
Insights:¶
• Higher Bedroom Count = Higher Price: Homes with more bedrooms generally have higher average prices.
• Geographical Influence: Certain zip codes, likely more affluent or desirable areas, consistently show higher home prices.
• Price Variation: Some zip codes show significant price variation, suggesting factors like location and amenities play a major role.
• Market Trends: The relationship between price and number of bedrooms indicates demand for larger homes in specific areas.
9. How does proximity to a waterfront affect house prices?¶
In [25]:
plt.figure(figsize=(10, 6))
# Creatimg a strip plot with hue based on 'waterfront'
sns.stripplot(x="waterfront", y="price", hue="waterfront", data=df, jitter=True, palette="Spectral", size=8, alpha=0.6)
# Title and labels
plt.title('9. Price Comparison Based on Proximity to Waterfront', color = 'darkblue',fontsize=18)
plt.xlabel('Waterfront (0: No, 1: Yes)', fontsize=14)
plt.ylabel('Price ($)', fontsize=14)
plt.xticks([0, 1], ['No', 'Yes'])
plt.legend(title='Waterfront', loc='upper left', bbox_to_anchor=(1.05, 1)) # Add legend
plt.show()
In [ ]:
Analysis:¶
• Waterfront houses (labeled as 'Yes') typically have higher prices compared to non-waterfront houses (labeled 'No').
• The plot shows variability in prices, with both waterfront and non-waterfront houses exhibiting a range of values.
Insights:¶
• Waterfront properties generally command higher prices.
• Price range for waterfront homes is wider, showing both affordable and premium waterfront houses.
• Non-waterfront properties tend to have a lower average price, but can still vary based on other features.
10. How does the number of floors in a house impact its price?¶
In [26]:
# Pivot data for heatmap visualization (using string 'mean' for aggregation)
floor_price_data = df.pivot_table(index="floors", values="price", aggfunc="mean")
plt.figure(figsize=(10, 6))
sns.heatmap(floor_price_data, annot=True, cmap="YlGnBu", cbar_kws={'label': 'Price ($)'}, linewidths=0.5)
plt.title("10. Price Variation by Number of Floors", color = 'darkblue', fontsize=16)
plt.xlabel("Number of Floors", fontsize=12)
plt.ylabel("Price ($)", fontsize=12)
plt.show()
Analysis:¶
• A heatmap is used to visualize the average price variation across different number of floors in the dataset.
• The pivot table shows the mean price for each number of floors, helping to identify the relationship between floor count and price.
• The color scale indicates price range, with darker shades corresponding to higher prices.
Insights:¶
• Houses with more floors tend to have higher average prices, though the variation between floors is minimal in some cases.
• Single-floor homes tend to have lower average prices compared to homes with multiple floors.
• The heatmap provides clear visual cues for how floor count is correlated with price.
Machine Learning Part:¶
In [27]:
# Importing necessary libraries for ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
# Preprocessing and feature selection
# For simplicity, let's use some important columns from the dataset
features = ['sqft_living', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'yr_built']
# Selecting X (features) and y (target variable)
X = df[features]
y = df['price']
# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scaling the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Initializing the Linear Regression model
model = LinearRegression()
# Training the model
model.fit(X_train_scaled, y_train)
# Predictions on test data
y_pred = model.predict(X_test_scaled)
# Model evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Output the evaluation metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared: {r2:.2f}")
# Visualizing the predictions vs actual prices in a colorful way
plt.figure(figsize=(10, 6))
# Scatter plot with color representing the predicted values
plt.scatter(y_test, y_pred, c=y_pred, cmap='viridis', alpha=0.7, edgecolor='w', s=80)
# Plotting the ideal prediction line (where predicted = actual)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', label='Ideal Prediction Line')
# Title and labels
plt.title("Predicted vs Actual Prices ", color = 'darkblue' , fontsize= 24)
plt.xlabel("Actual Prices ($)", fontsize=12)
plt.ylabel("Predicted Prices ($)", fontsize=12)
# Adding a color bar to show the range of predicted values
plt.colorbar(label='Predicted Price')
# Adding grid for better readability
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# Show the plot
plt.legend(loc='upper left')
plt.show()
Mean Squared Error (MSE): 52585547066.12 R-squared: 0.65
In [ ]: